Load required packages and read in combined data.

#packages
pacman::p_load(dplyr, 
               tidyr, 
               ggplot2, 
               rjson,
               rdatacite,
               cowplot, 
               stringr, 
               knitr, 
               DT)


# Restore the objects saved by 3_Combined_data.R; `combined_dois` is used
# immediately below, so it is expected to be among them.
load("data_rdata_files/Combined_ALL_data.Rdata")

# Keep only records with a publication year of 2012 or later.
all_dois <- filter(combined_dois, publicationYear >= 2012)

All Metadata combined

Look at dois by their origin (all types)

# Number of DOIs per source group, rendered as a table.
kable(
  summarise(
    group_by(all_dois, group),
    count = n()
  )
)
group count
Affiliation - CrossRef 147702
Affiliation - Datacite 51053
IR_publisher 24104

General data cleaning

# DRUM is inconsistently specified (with and without the "(DRUM)" suffix), so
# rewrite every matching publisher name to the full form.
all_dois$publisher[
  grepl("Data Repository for the University of Minnesota", all_dois$publisher)
] <- "Data Repository for the University of Minnesota (DRUM)"


# Remove MorphoSource data, as affiliation isn't included.
# A logical mask is used instead of `-which(...)`: when nothing matches,
# `-which(...)` is `integer(0)` and would select zero rows rather than all of
# them. `%in%` treats NA as a non-match, so rows with a missing
# `publisher_plus` are kept, exactly as `which()` kept them.
all_dois2 <- all_dois[!(all_dois$publisher_plus %in% "Duke-MorphoSource Media"), ]

# Make sure "Dataset" is capitalized in all metadata resource types.
# Assigning into the column vector is a no-op when there are no lowercase
# entries; assigning into a zero-row subset of the data frame raises an error.
all_dois2$resourceTypeGeneral[
  all_dois2$resourceTypeGeneral %in% "dataset"
] <- "Dataset"

Collapse IRs into a single category

Look at all the Institutional Repositories Captured

# Count the records of each institutional repository (rows whose group is
# "IR_publisher"), keyed by `publisher_plus`.
IR_pubs <- summarise(
  group_by(
    filter(all_dois2, group == "IR_publisher"),
    publisher_plus
  ),
  count = n()
)

kable(IR_pubs, col.names = c("Institutional Repository", "Count"))
Institutional Repository Count
Cornell 4758
Duke-Duke Digital Repository 76
Duke-Research Data Repository, Duke University 147
Michigan 10
Michigan-Deep Blue 637
Michigan-ICPSR/ISR 109
Michigan-Other 57
Minnesota 692
Virginia Tech 333
Washington U 4085

Replace all of these publishers with “Institutional Repository” so that they will be represented in a single bar.

# Relabel every record that belongs to one of the institutional repositories
# listed in IR_pubs.
all_dois2$publisher[
  all_dois2$publisher_plus %in% IR_pubs$publisher_plus
] <- "Institutional Repository"

# Catch the remaining institutional-repository records that are only
# identifiable by publisher name: the rest of "Cornell University Library",
# stray Virginia Tech records, and DRUM.
all_dois2$publisher[
  all_dois2$publisher %in% c(
    "Cornell University Library",
    "University Libraries, Virginia Tech",
    "Data Repository for the University of Minnesota (DRUM)"
  )
] <- "Institutional Repository"

## ICPSR is also inconsistent.
# The match must be computed on all_dois2 itself: all_dois still contains the
# MorphoSource rows, so row positions taken from all_dois point at the wrong
# rows of all_dois2.
all_dois2$publisher[
  grepl("Consortium for Political", all_dois2$publisher)
] <- "ICPSR"

Overall counts by resource type

Counts by resource type

# Number of DOIs for each institution / resource type pair, sorted within
# institution from the most common resource type to the least.
by_resource <- arrange(
  summarise(
    group_by(all_dois2, institution, resourceTypeGeneral),
    count = n()
  ),
  institution,
  desc(count)
)

Create a table of top resources

# One row per resource type with one column per institution (missing
# combinations filled with 0), plus a row-wise Total used for sorting.
# Optional restriction to the main resource types, left disabled:
#   filter(resourceTypeGeneral %in% c("Dataset", "Software", "Text", "Image"))
by_resource_table <- pivot_wider(
  by_resource,
  names_from = institution,
  values_from = count,
  values_fill = 0
) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

datatable(by_resource_table)

Write out the resources

# Save the resource-type counts. FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
write.csv(
  by_resource_table,
  file = "data_summary_data/Counts of Resource Types by Insitution.csv",
  row.names = FALSE
)

Data specific DOIs

Subset to only datasets

# Records whose resource type is "Dataset".
data_dois <- filter(all_dois2, resourceTypeGeneral == "Dataset")

Data DOIs by publisher

# Dataset DOI counts for each publisher / institution pair, sorted within
# institution from the largest publisher to the smallest.
by_publisher_data <- arrange(
  summarise(
    group_by(data_dois, publisher, institution),
    count = n()
  ),
  institution,
  desc(count)
)


# One row per publisher with one column per institution (0 where a publisher
# has no records), plus a row-wise Total used for sorting.
by_publisher_data_table <- pivot_wider(
  by_publisher_data,
  names_from = institution,
  values_from = count,
  values_fill = 0
) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

datatable(by_publisher_data_table)

Write out the table of data publishers

# Save the per-publisher dataset counts. FALSE is spelled out because `F` is
# an ordinary variable that can be reassigned.
write.csv(
  by_publisher_data_table,
  file = "data_summary_data/Counts of Data Publishers By Insitituion.csv",
  row.names = FALSE
)

After reviewing the repositories, we will remove the Faculty Opinions LTD records (these are reviews of articles, no data attached) and the ENCODE Data Coordination Center (extreme outlier for Michigan in 2022, unclear what the level of DOI assignment is) from further analysis.

Software specific DOIs

Subset to only software (only datacite has software)

# Records whose resource type is "Software".
software_dois <- filter(all_dois2, resourceTypeGeneral == "Software")

# Software DOI counts for each publisher / institution pair, sorted within
# institution from the largest publisher to the smallest.
by_publisher_software <- arrange(
  summarise(
    group_by(software_dois, publisher, institution),
    count = n()
  ),
  institution,
  desc(count)
)

# One row per publisher with one column per institution (0 where a publisher
# has no records), plus a row-wise Total used for sorting.
by_publisher_software_table <- pivot_wider(
  by_publisher_software,
  names_from = institution,
  values_from = count,
  values_fill = 0
) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

datatable(by_publisher_software_table)

Write out the table of software publishers

# Save the per-publisher software counts. FALSE is spelled out because `F` is
# an ordinary variable that can be reassigned.
write.csv(
  by_publisher_software_table,
  file = "data_summary_data/Counts of Software Publishers By Insitituion.csv",
  row.names = FALSE
)

Graphs

Top publishers - Data DOIs

Plot publishers by rank, ordered from most DOIs to least (showing the top 20). Remove the ENCODE Data Coordination Center and Faculty Opinions Ltd from the list.

# Drop the two excluded publishers from both the long and the wide
# publisher tables.
by_publisher_data <- by_publisher_data %>%
  filter(
    publisher != "ENCODE Data Coordination Center",
    publisher != "Faculty Opinions Ltd"
  )

by_publisher_data_table <- by_publisher_data_table %>%
  filter(
    publisher != "ENCODE Data Coordination Center",
    publisher != "Faculty Opinions Ltd"
  )

# Total DOIs per publisher, sorted from most to least, plotted against rank.
# The rows are already sorted, so the rank is the row position: row_number()
# states that directly. order() returns a sorting permutation, which equals
# the rank only because the data happen to be sorted beforehand.
by_publisher_data %>%
  group_by(publisher) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count)) %>%
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 20), n.breaks = 20) +
  labs(
    x = "Publisher Rank",
    y = "Number of DOIs",
    title = "Number of DOIs by top Publishers"
  ) +
  coord_cartesian(xlim = c(1, 20)) +
  theme_bw()

Based on the graph above, it appears that there is a large drop off after the top 7 publishers. If we look at the top 7 publishers for the data dois, how many DOIs does this cover?

# Publishers in the first seven rows of the wide table (sorted by Total).
top7pubs <- by_publisher_data_table$publisher[seq_len(7)]

# Compare the top-7 publishers with all others: total DOIs, number of
# publishers, and share of all DOIs in each of the two groups.
summarise(
  group_by(by_publisher_data, publisher),
  count = sum(count)
) %>%
  mutate(intop7pub = publisher %in% top7pubs) %>%
  group_by(intop7pub) %>%
  summarise(totalDOIs = sum(count), nrepos = n()) %>%
  ungroup() %>%
  mutate(propDOIs = totalDOIs / sum(totalDOIs)) %>%
  kable(
    col.names = c(
      "In Top 7 Publishers",
      "Total N DOIs",
      "Total N Publishers",
      "Proportion of Total DOIs"
    )
  )
In Top 7 Publishers Total N DOIs Total N Publishers Proportion of Total DOIs
FALSE 2274 159 0.0779461
TRUE 26900 7 0.9220539

Plotting the number of DOIs in the top 7 publishers by institution

# Fill colour for each publisher shown in the plot.
top7colors <- c(
  "Harvard Dataverse" = "dodgerblue2",
  "Zenodo" = "darkorange1",
  "ICPSR" = "darkcyan",
  "Dryad" = "lightgray",
  "Qualitative Data Repository" = "gold1",
  "figshare" = "purple",
  "Institutional Repository" = "lightblue"
)


# Dodged bars of dataset DOI counts per institution, one bar per top-7
# publisher. The visible y range is capped at 5000 (see the caption).
by_publisher_data_plot <- by_publisher_data %>%
  filter(publisher %in% top7pubs) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top7colors, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  scale_y_continuous(breaks = seq(0, 5000, by = 500)) +
  coord_cartesian(ylim = c(0, 5000)) +
  labs(
    x = "Institution",
    y = "Count of Data DOIs",
    caption = "Note: Michigan Dataverse bar cutoff for scaling"
  ) +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

# Display the plot.
by_publisher_data_plot

ggsave(
  filename = "figures/Counts of Data DOIs by Institution - ForPaper.png",
  plot = by_publisher_data_plot,
  device = "png",
  width = 8,
  height = 6,
  units = "in"
)

Institutional Graphs

Cornell

Duke

Michigan

Minnesota

Virginia Tech

Wash U

Top publishers - Software DOIs

Look at the top software publishers (This excludes CrossRef affiliation data, as software is not a resource type).

# Total software DOIs per publisher, sorted from most to least, plotted
# against rank. The rows are already sorted, so the rank is the row position:
# row_number() states that directly. order() returns a sorting permutation,
# which equals the rank only because the data happen to be sorted beforehand.
by_publisher_software %>%
  group_by(publisher) %>%
  summarize(count = sum(count)) %>%
  arrange(desc(count)) %>%
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 20), n.breaks = 20) +
  labs(
    x = "Publisher Rank",
    y = "Number of DOIs",
    title = "Number of Software DOIs by top Publishers"
  ) +
  coord_cartesian(xlim = c(1, 20)) +
  theme_bw()

It looks like there is one primary software publisher, but we could also take the top 4 or 5 to capture the majority. The plot below uses the top 6.

# Publishers in the first six rows of the wide software table (sorted by Total).
top6pubs_soft <- by_publisher_software_table$publisher[seq_len(6)]

# Fill colour for each software publisher shown in the plot.
top6colors_soft <- c(
  "Zenodo" = "darkorange1",
  "Code Ocean" = "darkblue",
  "Institutional Repository" = "lightblue",
  "Optica Publishing Group" = "red",
  "CoMSES Net" = "pink",
  "figshare" = "purple"
)


# Dodged bars of software DOI counts per institution, one bar per top-6
# publisher.
by_publisher_software_plot <- by_publisher_software %>%
  filter(publisher %in% top6pubs_soft) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top6colors_soft, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  labs(x = "Institution", y = "Count of Software DOIs") +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

# Display the plot.
by_publisher_software_plot

ggsave(
  filename = "figures/Counts of Software DOIs by Institution.png",
  plot = by_publisher_software_plot,
  device = "png",
  width = 8,
  height = 6,
  units = "in"
)

Institutional Graphs - Software

Cornell

Duke

Michigan

Minnesota

Virginia Tech

Wash U

Collapsed DOIs

Some repositories (such as Harvard’s Dataverse and Qualitative Data Repository) assign DOIs at the level of the file, rather than the study. Similarly, Zenodo often has many related DOIs for multiple figures within a study. In order to attempt to compare study-to-study counts of data sharing, look at the DOIs collapsed by “container”.

# For records that have a container identifier, count how many DOIs share each
# container / publisher / title / institution combination, largest first.
by_container <- arrange(
  summarise(
    group_by(
      filter(all_dois2, !is.na(container_identifier)),
      container_identifier, publisher, title, institution
    ),
    count = n()
  ),
  desc(count)
)

How many publishers have container DOIs?

# Number of by_container rows per publisher, largest first.
datatable(
  arrange(
    summarise(group_by(by_container, publisher), count = n()),
    desc(count)
  )
)

Collapsing by container for counts

# Row positions of records that repeat an earlier, non-missing container
# identifier (duplicated() leaves the first occurrence unflagged).
containerdups <- which(
  !is.na(all_dois2$container_identifier) &
    duplicated(all_dois2$container_identifier)
)

# Keep every row that is not a repeat. Selecting the rows to keep, instead of
# using `[-containerdups, ]`, stays correct when there are no repeats: a
# negative empty index would return zero rows.
all_dois_collapsed <- all_dois2[
  setdiff(seq_len(nrow(all_dois2)), containerdups),
]

Overall Count of Data DOIs

Faculty Opinions LTD and ENCODE Data Coordination Center are removed from this analysis as well.

# Collapsed dataset records, without the two excluded publishers.
data_dois_collapse <- filter(
  all_dois_collapsed,
  resourceTypeGeneral == "Dataset",
  publisher != "ENCODE Data Coordination Center",
  publisher != "Faculty Opinions Ltd"
)

# Collapsed dataset counts for each publisher / institution pair, sorted
# within institution from the largest publisher to the smallest.
by_publisher_data_collapse <- arrange(
  summarise(
    group_by(data_dois_collapse, publisher, institution),
    count = n()
  ),
  institution,
  desc(count)
)

Table of publisher counts

# One row per publisher with one column per institution (0 where a publisher
# has no records), plus a row-wise Total used for sorting.
by_publisher_data_collapse_table <- pivot_wider(
  by_publisher_data_collapse,
  names_from = institution,
  values_from = count,
  values_fill = 0
) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

datatable(by_publisher_data_collapse_table)

Write out the table of data publishers

# Save the collapsed per-publisher dataset counts. FALSE is spelled out
# because `F` is an ordinary variable that can be reassigned.
write.csv(
  by_publisher_data_collapse_table,
  file = "data_summary_data/Counts of Data Publishers By Insitituion - Collapsed by container.csv",
  row.names = FALSE
)

Graphs

Top 7 publishers of data dois

# Collapsed dataset counts for each publisher / institution pair, sorted
# within institution from the largest publisher to the smallest.
by_publisher_data_dc_collapse <- arrange(
  summarise(
    group_by(data_dois_collapse, publisher, institution),
    count = n()
  ),
  institution,
  desc(count)
)

# Table of publishers - data: one column per institution (0 where a publisher
# has no records), plus a row-wise Total used for sorting.
by_publisher_data_dc_collapse_table <- pivot_wider(
  by_publisher_data_dc_collapse,
  names_from = institution,
  values_from = count,
  values_fill = 0
) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(Cornell:`Washington U`))) %>%
  arrange(desc(Total))

Look at publishers based on rank of number of DOIs

# Total collapsed DOIs per publisher, sorted from most to least, plotted
# against rank. The rows are already sorted, so the rank is the row position:
# row_number() states that directly. order() returns a sorting permutation,
# which equals the rank only because the data happen to be sorted beforehand.
by_publisher_data_dc_collapse_table %>%
  group_by(publisher) %>%
  summarize(count = sum(Total)) %>%
  arrange(desc(count)) %>%
  mutate(pubrank = row_number()) %>%
  ggplot(aes(x = pubrank, y = count)) +
  geom_bar(stat = "identity") +
  scale_x_continuous(limits = c(0, 25)) +
  labs(
    x = "Publisher Rank",
    y = "Number of DOIs",
    title = "Number of DOIs by top Publishers"
  ) +
  theme_bw()

Look at the top 7 publishers - how many does this capture?

# Publishers in the first seven rows of the collapsed wide table (sorted by
# Total).
top7pubs <- by_publisher_data_dc_collapse_table$publisher[seq_len(7)]

# Compare the top-7 publishers with all others: total DOIs, number of
# publishers, and share of all DOIs in each of the two groups.
summarise(
  group_by(by_publisher_data_dc_collapse_table, publisher),
  count = sum(Total)
) %>%
  mutate(intop7pub = publisher %in% top7pubs) %>%
  group_by(intop7pub) %>%
  summarise(totalDOIs = sum(count), nrepos = n()) %>%
  ungroup() %>%
  mutate(propDOIs = totalDOIs / sum(totalDOIs))
## # A tibble: 2 × 4
##   intop7pub totalDOIs nrepos propDOIs
##   <lgl>         <int>  <int>    <dbl>
## 1 FALSE          1547    159    0.112
## 2 TRUE          12228      7    0.888
# Fill colour for each publisher shown in the collapsed plot.
top7colors <- c(
  "Harvard Dataverse" = "dodgerblue2",
  "Zenodo" = "darkorange1",
  "ICPSR" = "darkcyan",
  "Dryad" = "lightgray",
  "figshare" = "purple",
  "Institutional Repository" = "lightblue",
  "Taylor & Francis" = "gold2"
)



# Dodged bars of collapsed dataset DOI counts per institution, one bar per
# top-7 publisher. The y-axis breaks and cap are left disabled here.
by_publisher_data_plot_collapse <- by_publisher_data_dc_collapse %>%
  filter(publisher %in% top7pubs) %>%
  ggplot(aes(x = institution, y = count, fill = publisher)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = top7colors, name = "Publisher") +
  guides(fill = guide_legend(title.position = "top")) +
  #scale_y_continuous(breaks = seq(from = 0, to=5000, by=500)) +
  #coord_cartesian(ylim = c(0,5000)) +
  labs(x = "Institution", y = "Count of Collapsed Data DOIs") +
  theme_bw() +
  theme(legend.position = "bottom", legend.title.align = .5)

# Display the plot.
by_publisher_data_plot_collapse

ggsave(
  filename = "figures/Counts of Data DOIs by Institution_DOIcollapsed.png",
  plot = by_publisher_data_plot_collapse,
  device = "png",
  width = 8,
  height = 6,
  units = "in"
)

Institutional Graphs - Collapsed

Cornell

Duke

Michigan

Minnesota

Virginia Tech

Wash U

Further collapse by Version

We can also look at the data collapsed by version of a record. This was motivated because some repositories have multiple entries for the different versions of the same dataset/collection. And some entries have many versions.

Explore versions

Some repositories append “.vX” (where X is a version number) to the DOI.

# Flag DOIs that end in ".v" followed by one or more digits.
all_dois_collapsed <- mutate(
  all_dois_collapsed,
  hasversion = grepl("\\.v[[:digit:]]+$", DOI)
)


# Number of flagged DOIs per publisher, largest first.
datatable(
  arrange(
    summarise(
      group_by(filter(all_dois_collapsed, hasversion), publisher, hasversion),
      count = n()
    ),
    desc(count)
  )
)

Some repositories use the “versionCount” field.

# Per publisher: number of records with versionCount above zero and their
# mean versionCount (rounded to 2 decimals), largest count first.
datatable(
  arrange(
    summarise(
      group_by(filter(all_dois_collapsed, versionCount > 0), publisher),
      count = n(),
      AvgNversions = round(mean(versionCount), 2)
    ),
    desc(count)
  )
)

Some use “metadataVersion”

# Per publisher: number of records with metadataVersion above zero and their
# mean metadataVersion (rounded to 2 decimals), largest count first.
datatable(
  arrange(
    summarise(
      group_by(filter(all_dois_collapsed, metadataVersion > 0), publisher),
      count = n(),
      AvgNversions = round(mean(metadataVersion), 2)
    ),
    desc(count)
  )
)

How to collapse by version? Maybe that’s for another day…

Write out institutional data

Write out CSV files for each institution:

  • All DOIs
  • All DOIs collapsed
# Date stamp (YYYYMMDD), computed once so that every file written in this run
# carries the same suffix.
date_stamp <- gsub("-", "", Sys.Date())

for (i in unique(all_dois2$institution)) {
  # All DOIs for this institution. The cleaned table (all_dois2) is used so
  # that this file matches the table the loop iterates over and the one the
  # collapsed file is derived from; all_dois still contains the MorphoSource
  # records and the un-normalized publisher names.
  all_dois2 %>%
    filter(institution == i) %>%
    write.csv(
      file = paste0("data_all_dois/All_dois_", i, date_stamp, ".csv"),
      row.names = FALSE
    )

  # All DOIs for this institution, collapsed by container.
  all_dois_collapsed %>%
    filter(institution == i) %>%
    write.csv(
      file = paste0("data_all_dois/All_dois_collapsed_", i, date_stamp, ".csv"),
      row.names = FALSE
    )
}